Portfolio 3 - Deep Learning |
||||
| Coach | Opdrachtnummer | Namen groepsleden | ||
|---|---|---|---|---|
| Vikram en Bob | 3 |
|
||
In dit notebook gaan we aan de slag met drie onderdelen. In het eerste deel gaan we categorieën voorspellen van de artikelen, evenals de coördinaten van de locatie binnen de afbeelding. In deel twee wordt er een model getraind om tekstbijschriften te genereren die kledingstukken beschrijven met behulp van relevante modetermen. In het laatste deel wordt een model gefinetuned om afbeeldingen van kleding te genereren op basis van tekstbeschrijvingen.
import glob as glob
import io
import os
import random
import re
import shutil
from collections import Counter, defaultdict

import cv2
import datasets
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from datasets import load_dataset, Image
from nltk.corpus import stopwords
from PIL import Image
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.applications import efficientnet
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
nltk.download('stopwords')
WARNING:tensorflow:From C:\Users\Gebruiker\anaconda3\lib\site-packages\keras\src\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\Gebruiker\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
True
# functie om bounding boxes te converteren in YOLO format xmin, ymin, xmax, ymax.
def yolo2bbox(bboxes):
    """Convert a YOLO-format box (x_center, y_center, width, height)
    to corner coordinates (xmin, ymin, xmax, ymax)."""
    cx, cy, bw, bh = bboxes[0], bboxes[1], bboxes[2], bboxes[3]
    half_w = bw / 2
    half_h = bh / 2
    return cx - half_w, cy - half_h, cx + half_w, cy + half_h
class_names = ["sunglass", "hat", "jacket", "shirt", "pants", "shorts", "skirt", "dress", "bag", "shoe"]
def plot_box(image, bboxes, labels):
    """Draw YOLO-format bounding boxes with class-name labels on an image.

    Args:
        image: BGR image array (as returned by cv2.imread).
        bboxes: iterable of normalized [x_center, y_center, width, height] boxes.
        labels: iterable of class ids (str or int), parallel to bboxes.

    Returns:
        The image with boxes and label banners rendered in place.
    """
    # Need the image height and width to denormalize the bounding box coordinates.
    h, w = image.shape[:2]
    for box_num, box in enumerate(bboxes):
        # Obtain top-left and bottom-right corners.
        x1, y1, x2, y2 = yolo2bbox(box)
        # Denormalize the coordinates to pixel values.
        xmin = int(x1 * w)
        ymin = int(y1 * h)
        xmax = int(x2 * w)
        ymax = int(y2 * h)
        class_name = class_names[int(labels[box_num])]
        # Render bounding box around the object; thickness scales with image width.
        cv2.rectangle(image, (xmin, ymin), (xmax, ymax), color=(0, 0, 255), thickness=max(2, int(w/275)))
        # BUGFIX: the original clamps were inverted — min(1, max(4, x)) and
        # min(1, max(10, x)) always evaluate to 1. Clamp the font scale to
        # [1, 4] and the thickness to [1, 10] so they actually scale with width.
        font_scale = max(1, min(4, int(w/500)))
        font_thickness = max(1, min(10, int(w/50)))
        p1 = (int(xmin), int(ymin))
        # Text width and height, used to size the filled label background.
        tw, th = cv2.getTextSize(class_name, 0, fontScale=font_scale, thickness=font_thickness)[0]
        p2 = p1[0] + tw, p1[1] + -th - 10
        cv2.rectangle(image, p1, p2, color=(255, 0, 0), thickness=-1)
        cv2.putText(image, class_name, (xmin+1, ymin-10), cv2.FONT_HERSHEY_SIMPLEX,
                    font_scale, (255, 255, 255), font_thickness, cv2.LINE_AA)
    return image
def visualize(image_paths, label_paths, num_samples):
    """Show `num_samples` random training images with their YOLO boxes drawn.

    Args:
        image_paths: glob pattern matching the image files.
        label_paths: glob pattern matching the YOLO label .txt files.
        num_samples: number of random images to plot (grid is 2x2, so <= 4).
    """
    all_training_images = glob.glob(image_paths)
    all_training_labels = glob.glob(label_paths)
    # Sort both lists so image[i] lines up with label[i].
    all_training_images.sort()
    all_training_labels.sort()
    num_images = len(all_training_images)
    plt.figure(figsize=(20, 17))
    for i in range(num_samples):
        j = random.randint(0, num_images - 1)
        image = cv2.imread(all_training_images[j])
        bboxes = []
        labels = []
        with open(all_training_labels[j], 'r') as f:
            for label_line in f:
                # BUGFIX: split on whitespace instead of fixed character
                # positions (label_line[0] / label_line[2:]), so multi-digit
                # class ids and trailing newlines are handled correctly.
                parts = label_line.split()
                if not parts:
                    continue  # skip blank lines
                label = parts[0]
                x_c, y_c, w, h = (float(v) for v in parts[1:5])
                bboxes.append([x_c, y_c, w, h])
                labels.append(label)
        result_image = plot_box(image, bboxes, labels)
        plt.subplot(2, 2, i + 1)
        # Reverse the channel axis: cv2 loads BGR, matplotlib expects RGB.
        plt.imshow(result_image[:, :, ::-1])
        plt.axis('off')
    plt.subplots_adjust(wspace=0)
    plt.tight_layout()
    plt.show()
# Plot 4 random training images with their ground-truth bounding boxes.
visualize(
    image_paths='Train/images/*',
    label_paths='Train/labels/*',
    num_samples=4,
)
In deze visualisatie worden de boundingboxes in de foto's getoond met hun bijbehorende klassenaam.
def analyze_dataset(image_paths, label_paths, class_names):
    """Count class occurrences over every label file and plot the distribution
    as a bar chart (x-axis: class names, y-axis: number of annotated objects)."""
    all_training_images = glob.glob(image_paths)
    all_training_labels = glob.glob(label_paths)
    if not all_training_images or not all_training_labels:
        raise ValueError("No images or labels found with the given paths.")
    all_training_images.sort()
    all_training_labels.sort()
    # Tally how often each class id appears across all label files.
    category_count = defaultdict(int)
    for label_path in all_training_labels:
        with open(label_path, 'r') as f:
            for label_line in f.readlines():
                class_id = int(label_line.split()[0])
                category_count[class_id] += 1
    # Build the bar chart, with classes ordered by id.
    sorted_ids = sorted(category_count.keys())
    category_labels = [class_names[i] for i in sorted_ids]
    category_values = [category_count[i] for i in sorted_ids]
    plt.figure(figsize=(10, 6))
    plt.bar(category_labels, category_values, color='skyblue')
    plt.xlabel('Categorieën')
    plt.ylabel('Aantal')
    plt.title('Verdeling van Categorieën in de Hele Dataset')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
# Glob patterns for the training images and their YOLO label files.
image_paths = 'Train/images/*'
label_paths = 'Train/labels/*'
# Analyze and visualize the category distribution over the whole dataset.
analyze_dataset(image_paths, label_paths, class_names)
In deze grafiek zie je hoe vaak een artikel voorkomt in een foto. Je ziet dat schoenen het vaakst voorkomen, daarna shirts, gevolgd door tassen.
De data voor image captioning en beeldgeneratie komt uit een huggingface dataset, H & M Fashion Caption 12k.
Om de data in te kunnen laden heb je een token nodig van huggingface.
from huggingface_hub import HfFolder
from huggingface_hub import whoami

# SECURITY FIX: the original hard-coded a Hugging Face access token in source,
# leaking the credential. Read the token from the environment instead; set it
# via `HF_TOKEN` before running (create one at https://huggingface.co/settings/tokens).
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise RuntimeError("HF_TOKEN environment variable is not set.")

# Authenticate with Hugging Face using the token
HfFolder.save_token(hf_token)

# Verify the authentication (optional)
user = whoami()
print(f"Authenticated as: {user['name']}")
Authenticated as: ThugWithin
De data wordt ingeladen met load_dataset(). De dataset bevat alleen een train set met daarin de kolommen 'text' en 'image' met 12437 rijen. Om de data verder te kunnen analyseren heb ik de data eerst omgezet naar een pandas dataframe met '.to_pandas()'. De 'image' is omgezet naar een dictionary die de keys 'bytes' en 'path' bevat. De 'bytes' is de afbeelding in bytes en de 'path' is de naam van de afbeelding. De 'text' bevat de captions.
# Load the H&M fashion caption dataset (12k image/caption pairs) from the Hugging Face Hub.
dataset = load_dataset('tomytjandra/h-and-m-fashion-caption-12k')
Downloading readme: 0%| | 0.00/323 [00:00<?, ?B/s]
Downloading data: 0%| | 0.00/478M [00:00<?, ?B/s]
Downloading data: 0%| | 0.00/465M [00:00<?, ?B/s]
Downloading data: 0%| | 0.00/418M [00:00<?, ?B/s]
Downloading data: 0%| | 0.00/335M [00:00<?, ?B/s]
Downloading data: 0%| | 0.00/331M [00:00<?, ?B/s]
Downloading data: 0%| | 0.00/321M [00:00<?, ?B/s]
Downloading data: 0%| | 0.00/307M [00:00<?, ?B/s]
Downloading data: 0%| | 0.00/307M [00:00<?, ?B/s]
Downloading data: 0%| | 0.00/319M [00:00<?, ?B/s]
Downloading data: 0%| | 0.00/304M [00:00<?, ?B/s]
Downloading data: 0%| | 0.00/297M [00:00<?, ?B/s]
Generating train split: 0%| | 0/12437 [00:00<?, ? examples/s]
print(dataset.keys())
dict_keys(['train'])
print(dataset['train'])
Dataset({
features: ['text', 'image'],
num_rows: 12437
})
# Convert the Hugging Face train split to a pandas DataFrame.
# Each row has a 'text' caption string and an 'image' dict with 'bytes' and 'path' keys.
df = dataset['train'].to_pandas()
print(df['image'][0].keys())
dict_keys(['bytes', 'path'])
print(df['image'][0]['path'])
0116379047.jpg
print(df['text'][0])
solid dark blue fitted top in soft stretch jersey with a wide neckline and long sleeves
Een nieuwe directory wordt gemaakt voor de images met een custom filename.
# Extract the raw JPEG bytes from the dataframe into an image directory.
directory = 'Images'
os.makedirs(directory, exist_ok=True)

# Zero-pad the index so filenames sort lexicographically (image00000.jpg, ...).
num_images = len(df['image'])
num_digits = len(str(num_images))

for index, image_dict in enumerate(df['image']):
    jpeg_data = image_dict['bytes']
    image_name = f'image{str(index).zfill(num_digits)}.jpg'
    # CONSISTENCY FIX: write into `directory` instead of repeating the
    # literal 'Images', so changing the directory name needs one edit only.
    with open(os.path.join(directory, image_name), 'wb') as f:
        f.write(jpeg_data)

images = sorted(os.listdir(directory))  # sorted list of image filenames
Ik maak een nieuwe dataframe met de custom image filenames, image width, image height, filepath en captions.
# Collect (filename, width, height) for every extracted image.
dimensions = []
for image_filename in images:
    image_path = os.path.join(directory, image_filename)
    with Image.open(image_path) as img:
        width, height = img.size
    dimensions.append((image_filename, width, height))

dataframe = pd.DataFrame(dimensions, columns=['filename', 'width', 'height'])
# BUGFIX: interpolate the filename into the path — the original wrote the
# same fixed placeholder string for every row instead of the per-row path.
dataframe['file_path'] = [f'Images/{filename}' for filename in dataframe['filename']]
# Captions come row-aligned from the original HF dataframe.
dataframe['caption'] = df['text']
dataframe.head(10)
| filename | width | height | file_path | caption | |
|---|---|---|---|---|---|
| 0 | image00000.jpg | 1531 | 1750 | Images/image00000.jpg | solid dark blue fitted top in soft stretch jer... |
| 1 | image00001.jpg | 772 | 1750 | Images/image00001.jpg | melange dark grey trousers in sweatshirt fabri... |
| 2 | image00002.jpg | 1166 | 1750 | Images/image00002.jpg | solid black leggings in soft jersey with a wid... |
| 3 | image00003.jpg | 1166 | 1750 | Images/image00003.jpg | solid black short padded jacket with a jersey-... |
| 4 | image00004.jpg | 1166 | 1750 | Images/image00004.jpg | solid black harem trousers in jersey with elas... |
| 5 | image00005.jpg | 1535 | 1750 | Images/image00005.jpg | melange light blue top in soft slub jersey wit... |
| 6 | image00006.jpg | 1166 | 1750 | Images/image00006.jpg | solid black 34-length leggings in soft organic... |
| 7 | image00007.jpg | 987 | 1750 | Images/image00007.jpg | stripe white short-sleeved dress in slub jerse... |
| 8 | image00008.jpg | 1543 | 1750 | Images/image00008.jpg | check dark purple shirt in checked flannel wit... |
| 9 | image00009.jpg | 1166 | 1750 | Images/image00009.jpg | solid greenish khaki joggers in washed cotton ... |
# Show the first 10 images with their captions printed underneath.
for i, row in dataframe[:10].iterrows():
    image = mpimg.imread(row['file_path'])
    caption = row['caption']
    plt.imshow(image)
    plt.axis('off')
    plt.show()
    print(caption)
    print()
solid dark blue fitted top in soft stretch jersey with a wide neckline and long sleeves
melange dark grey trousers in sweatshirt fabric with an elasticated drawstring waist side pockets a back pocket and ribbed hems soft brushed inside
solid black leggings in soft jersey with a wide panel at the waist for best fit over the tummy the cotton content of the leggings is organic
solid black short padded jacket with a jersey-lined hood and stand-up collar with a drawstring zip down the front flap side pockets with a press-stud one inner pocket and ribbing at the cuffs and hem quilted lining
solid black harem trousers in jersey with elastication at the waist and hems
melange light blue top in soft slub jersey with a fake front pocket short sleeves with sewn-in turn-ups and a rounded hem
solid black 34-length leggings in soft organic cotton jersey with an elasticated waist
stripe white short-sleeved dress in slub jersey with an elasticated waist and side pockets
check dark purple shirt in checked flannel with a collar and chest pocket regular fit
solid greenish khaki joggers in washed cotton twill with an elasticated drawstring waist side and back pockets and elasticated hems loose fit
Nieuwe kolommen gemaakt voor de dataframe met behulp van functies. Eerst kolommen gemaakt met de caption inclusief stopwords en daarna zonder stopwords.
def tokenize(caption):
    """Lowercase a caption, strip punctuation, and split it into word tokens."""
    lowered = caption.lower()
    # Drop every character that is not a word character or whitespace.
    no_punct = re.sub(r'[^\w\s]', '', lowered)
    return no_punct.split()
def clean_caption(caption):
    """Lowercase a caption and keep only letters separated by single spaces."""
    text = caption.lower()
    # Remove every character that is not a letter or whitespace.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse runs of whitespace and trim the ends.
    return re.sub(r'\s+', ' ', text).strip()
def remove_stopwords(tokens):
    """Return `tokens` with English stop words removed (order preserved).

    Args:
        tokens: list of lowercase word strings.

    Returns:
        The input tokens, in order, minus English stop words.
    """
    # PERF: build a set for O(1) membership tests — stopwords.words() returns
    # a list, which would make every token lookup O(n).
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]
def text_features(dataframe, text_col):
    """Add token and cleaned-token columns derived from `text_col` in place.

    Adds: 'tokens', 'count_tokens', 'cleaned_text', 'cleaned_tokens',
    'count_cleaned_tokens'. Returns the same dataframe for chaining.
    """
    # Raw tokens (stop words kept) and their count per caption.
    dataframe['tokens'] = dataframe[text_col].apply(tokenize)
    dataframe['count_tokens'] = dataframe['tokens'].apply(len)
    # Cleaned text, tokens without stop words, and their count.
    dataframe['cleaned_text'] = dataframe[text_col].apply(clean_caption)
    dataframe['cleaned_tokens'] = dataframe['cleaned_text'].apply(tokenize).apply(remove_stopwords)
    dataframe['count_cleaned_tokens'] = dataframe['cleaned_tokens'].apply(len)
    return dataframe
dataframe.head(10)
| filename | width | height | file_path | caption | tokens | count_tokens | cleaned_text | cleaned_tokens | count_cleaned_tokens | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | image00000.jpg | 1531 | 1750 | Images/image00000.jpg | solid dark blue fitted top in soft stretch jer... | [solid, dark, blue, fitted, top, in, soft, str... | 16 | solid dark blue fitted top in soft stretch jer... | [solid, dark, blue, fitted, top, soft, stretch... | 12 |
| 1 | image00001.jpg | 772 | 1750 | Images/image00001.jpg | melange dark grey trousers in sweatshirt fabri... | [melange, dark, grey, trousers, in, sweatshirt... | 23 | melange dark grey trousers in sweatshirt fabri... | [melange, dark, grey, trousers, sweatshirt, fa... | 18 |
| 2 | image00002.jpg | 1166 | 1750 | Images/image00002.jpg | solid black leggings in soft jersey with a wid... | [solid, black, leggings, in, soft, jersey, wit... | 27 | solid black leggings in soft jersey with a wid... | [solid, black, leggings, soft, jersey, wide, p... | 15 |
| 3 | image00003.jpg | 1166 | 1750 | Images/image00003.jpg | solid black short padded jacket with a jersey-... | [solid, black, short, padded, jacket, with, a,... | 37 | solid black short padded jacket with a jerseyl... | [solid, black, short, padded, jacket, jerseyli... | 24 |
| 4 | image00004.jpg | 1166 | 1750 | Images/image00004.jpg | solid black harem trousers in jersey with elas... | [solid, black, harem, trousers, in, jersey, wi... | 13 | solid black harem trousers in jersey with elas... | [solid, black, harem, trousers, jersey, elasti... | 8 |
| 5 | image00005.jpg | 1535 | 1750 | Images/image00005.jpg | melange light blue top in soft slub jersey wit... | [melange, light, blue, top, in, soft, slub, je... | 22 | melange light blue top in soft slub jersey wit... | [melange, light, blue, top, soft, slub, jersey... | 16 |
| 6 | image00006.jpg | 1166 | 1750 | Images/image00006.jpg | solid black 34-length leggings in soft organic... | [solid, black, 34length, leggings, in, soft, o... | 13 | solid black length leggings in soft organic co... | [solid, black, length, leggings, soft, organic... | 10 |
| 7 | image00007.jpg | 987 | 1750 | Images/image00007.jpg | stripe white short-sleeved dress in slub jerse... | [stripe, white, shortsleeved, dress, in, slub,... | 14 | stripe white shortsleeved dress in slub jersey... | [stripe, white, shortsleeved, dress, slub, jer... | 10 |
| 8 | image00008.jpg | 1543 | 1750 | Images/image00008.jpg | check dark purple shirt in checked flannel wit... | [check, dark, purple, shirt, in, checked, flan... | 15 | check dark purple shirt in checked flannel wit... | [check, dark, purple, shirt, checked, flannel,... | 11 |
| 9 | image00009.jpg | 1166 | 1750 | Images/image00009.jpg | solid greenish khaki joggers in washed cotton ... | [solid, greenish, khaki, joggers, in, washed, ... | 22 | solid greenish khaki joggers in washed cotton ... | [solid, greenish, khaki, joggers, washed, cott... | 17 |
# Flatten the per-caption token lists (stop words already removed), then report
# the most common token and the vocabulary size.
all_tokens = [word for sub_list in dataframe['cleaned_tokens'] for word in sub_list]
all_unique_tokens = set(all_tokens)
most_common_token = Counter(all_tokens).most_common(1)[0]
print(f'Most common token: {most_common_token[0]}, {most_common_token[1]}')
print(f'Amount unique tokens: {len(all_unique_tokens)}')
Most common token: solid, 6805 Amount unique tokens: 1409
# Width count
print(Counter(dataframe['width']).most_common(5))
# Meest voorkomende width is 1166
[(1166, 9950), (1167, 34), (1314, 9), (1440, 7), (1477, 7)]
dataframe['width'].plot(kind='hist', bins=20, title='Width')
plt.gca().spines[['top', 'right',]].set_visible(False)
sns.countplot(data=dataframe, x='height').set_title('Height')
plt.show()
# Meest voorkomende height is 1750
dataframe['count_tokens'].min()
12
Het minimale aantal tokens (inclusief stopwords) in een caption is 12.
dataframe['count_tokens'].plot(kind='hist', bins=20, title='Count Tokens met Stopwords')
plt.gca().spines[['top', 'right',]].set_visible(False)
dataframe['count_cleaned_tokens'].plot(kind='hist', bins=20, title='Count Tokens without Stopwords')
plt.gca().spines[['top', 'right',]].set_visible(False)
Eerst splitsen we de data in een train-, validatie- en testset. De validatieset krijgt 20% van de data en de testset krijgt 10% van de data.
def create_validation_and_test_set(dataset_dir, validation_dir, test_dir, validation_split=0.2, test_split=0.1):
    """Split a YOLO-style dataset into validation and test sets by copying files.

    Images (and their matching label .txt files, when present) are copied from
    `dataset_dir` into `validation_dir` and `test_dir`; the remaining files stay
    in place as the training set. Files are copied, not moved, so the source
    directory is left intact.

    Args:
        dataset_dir: source directory containing 'images' and 'labels' subdirs.
        validation_dir: destination directory for the validation split.
        test_dir: destination directory for the test split.
        validation_split: fraction of images for validation (default 0.2).
        test_split: fraction of images for the test set (default 0.1).
    """
    # Ensure the destination directories and their images/labels subdirs exist.
    validation_images_dir = os.path.join(validation_dir, 'images')
    validation_labels_dir = os.path.join(validation_dir, 'labels')
    test_images_dir = os.path.join(test_dir, 'images')
    test_labels_dir = os.path.join(test_dir, 'labels')
    for d in (validation_dir, test_dir, validation_images_dir, validation_labels_dir,
              test_images_dir, test_labels_dir):
        os.makedirs(d, exist_ok=True)

    # List all image files in the dataset directory and shuffle for a random split.
    image_dir = os.path.join(dataset_dir, 'images')
    image_files = [f for f in os.listdir(image_dir) if f.endswith('.jpg')]
    random.shuffle(image_files)

    # Determine the number of images for the validation and test sets.
    num_total = len(image_files)
    num_validation = int(num_total * validation_split)
    num_test = int(num_total * test_split)

    # Partition into validation, test, and (implicit) training sets.
    validation_images = image_files[:num_validation]
    test_images = image_files[num_validation:num_validation + num_test]
    train_images = image_files[num_validation + num_test:]

    def move_files(image_list, dest_images_dir, dest_labels_dir):
        # Copy each image plus its label file when one exists.
        # BUGFIX: the original module never imported shutil, so these calls
        # raised NameError — shutil is now in the file's import block.
        for image in image_list:
            src_image = os.path.join(image_dir, image)
            dest_image = os.path.join(dest_images_dir, image)
            shutil.copyfile(src_image, dest_image)
            label_file = os.path.splitext(image)[0] + '.txt'
            src_label = os.path.join(dataset_dir, 'labels', label_file)
            dest_label = os.path.join(dest_labels_dir, label_file)
            if os.path.exists(src_label):
                shutil.copyfile(src_label, dest_label)
            else:
                print(f"Warning: Label file '{src_label}' not found.")

    # Copy files to the validation and test directories.
    move_files(validation_images, validation_images_dir, validation_labels_dir)
    move_files(test_images, test_images_dir, test_labels_dir)
    print(f"Validation set created with {num_validation} images.")
    print(f"Test set created with {num_test} images.")
    print(f"Training set remains with {len(train_images)} images.")


if __name__ == "__main__":
    dataset_dir = "Train"
    validation_dir = "Val"
    test_dir = "Test"
    validation_split = 0.2
    test_split = 0.1
    # Create validation and test sets
    create_validation_and_test_set(dataset_dir, validation_dir, test_dir, validation_split, test_split)
Vervolgens maken wij een yaml-bestand in Visual Studio Code; dit ziet er als volgt uit:
data2.yaml
train: '../Train/images'
val: '../Val/images'
test: '../Test/images'
nc: 10
names:
- 'sunglass'
- 'hat'
- 'jacket'
- 'shirt'
- 'pants'
- 'shorts'
- 'skirt'
- 'dress'
- 'bag'
- 'shoe'
Voor de objectdetectie maken we gebruik van het yolov5-model. Dit model doet voorspellingen door een combinatie van deep-learningtechnieken te gebruiken om objecten in een afbeelding te detecteren en te classificeren. Het yolo-netwerk bestaat uit drie onderdelen: de backbone, de neck en de head. In de backbone zit een CNN die de features uit de foto's extraheert en de dimensies reduceert. In de neck zitten lagen die de foto-features combineren tot complexere representaties. De head krijgt de features van de neck en maakt daarmee de bounding-boxvoorspellingen. (Solawetz, 2024)
if not os.path.exists('yolov5'):
!git clone https://github.com/ultralytics/yolov5.git
Hier clonen we de yolov5 map in onze directory
%cd yolov5/
!pwd
C:\Users\Gebruiker\Documents\GitHub\Deep-learning-\Portfolio opdracht 3\yolov5
'pwd' is not recognized as an internal or external command, operable program or batch file.
Nu gaan we het model trainen.
!python train.py --data ../data2.yaml --weights yolov5m.pt --img 640 --epochs {1} --batch-size 32
WARNING:tensorflow:From C:\Users\Gebruiker\anaconda3\lib\site-packages\keras\src\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead. train: weights=yolov5m.pt, cfg=, data=../data2.yaml, hyp=data\hyps\hyp.scratch-low.yaml, epochs=1, batch_size=32, imgsz=640, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, noplots=False, evolve=None, evolve_population=data\hyps, resume_evolve=None, bucket=, cache=None, image_weights=False, device=, multi_scale=False, single_cls=False, optimizer=SGD, sync_bn=False, workers=8, project=runs\train, name=exp, exist_ok=False, quad=False, cos_lr=False, label_smoothing=0.0, patience=100, freeze=[0], save_period=-1, seed=0, local_rank=-1, entity=None, upload_dataset=False, bbox_interval=-1, artifact_alias=latest, ndjson_console=False, ndjson_file=False github: up to date with https://github.com/ultralytics/yolov5 fatal: cannot change to 'C:\Users\Gebruiker\Documents\GitHub\Deep-learning-\Portfolio': No such file or directory YOLOv5 2024-6-16 Python-3.9.12 torch-2.3.1+cpu CPU hyperparameters: lr0=0.01, lrf=0.01, momentum=0.937, weight_decay=0.0005, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1, box=0.05, cls=0.5, cls_pw=1.0, obj=1.0, obj_pw=1.0, iou_t=0.2, anchor_t=4.0, fl_gamma=0.0, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, degrees=0.0, translate=0.1, scale=0.5, shear=0.0, perspective=0.0, flipud=0.0, fliplr=0.5, mosaic=1.0, mixup=0.0, copy_paste=0.0 Comet: run 'pip install comet_ml' to automatically track and visualize YOLOv5 runs in Comet TensorBoard: Start with 'tensorboard --logdir runs\train', view at http://localhost:6006/ Overriding model.yaml nc=80 with nc=10 from n params module arguments 0 -1 1 5280 models.common.Conv [3, 48, 6, 2, 2] 1 -1 1 41664 models.common.Conv [48, 96, 3, 2] 2 -1 2 65280 models.common.C3 [96, 96, 2] 3 -1 1 166272 models.common.Conv [96, 192, 3, 2] 4 -1 4 444672 models.common.C3 [192, 
192, 4] 5 -1 1 664320 models.common.Conv [192, 384, 3, 2] 6 -1 6 2512896 models.common.C3 [384, 384, 6] 7 -1 1 2655744 models.common.Conv [384, 768, 3, 2] 8 -1 2 4134912 models.common.C3 [768, 768, 2] 9 -1 1 1476864 models.common.SPPF [768, 768, 5] 10 -1 1 295680 models.common.Conv [768, 384, 1, 1] 11 -1 1 0 torch.nn.modules.upsampling.Upsample [None, 2, 'nearest'] 12 [-1, 6] 1 0 models.common.Concat [1] 13 -1 2 1182720 models.common.C3 [768, 384, 2, False] 14 -1 1 74112 models.common.Conv [384, 192, 1, 1] 15 -1 1 0 torch.nn.modules.upsampling.Upsample [None, 2, 'nearest'] 16 [-1, 4] 1 0 models.common.Concat [1] 17 -1 2 296448 models.common.C3 [384, 192, 2, False] 18 -1 1 332160 models.common.Conv [192, 192, 3, 2] 19 [-1, 14] 1 0 models.common.Concat [1] 20 -1 2 1035264 models.common.C3 [384, 384, 2, False] 21 -1 1 1327872 models.common.Conv [384, 384, 3, 2] 22 [-1, 10] 1 0 models.common.Concat [1] 23 -1 2 4134912 models.common.C3 [768, 768, 2, False] 24 [17, 20, 23] 1 60615 models.yolo.Detect [10, [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], [192, 384, 768]] Traceback (most recent call last): File "C:\Users\Gebruiker\Documents\GitHub\Deep-learning-\Portfolio opdracht 3\yolov5\train.py", line 848, in <module> main(opt) File "C:\Users\Gebruiker\Documents\GitHub\Deep-learning-\Portfolio opdracht 3\yolov5\train.py", line 623, in main train(opt.hyp, opt, device, callbacks) File "C:\Users\Gebruiker\Documents\GitHub\Deep-learning-\Portfolio opdracht 3\yolov5\train.py", line 189, in train model = Model(cfg or ckpt["model"].yaml, ch=3, nc=nc, anchors=hyp.get("anchors")).to(device) # create File "C:\Users\Gebruiker\Documents\GitHub\Deep-learning-\Portfolio opdracht 3\yolov5\models\yolo.py", line 248, in __init__ m.stride = torch.tensor([s / x.shape[-2] for x in forward(torch.zeros(1, ch, s, s))]) # forward File "C:\Users\Gebruiker\Documents\GitHub\Deep-learning-\Portfolio opdracht 3\yolov5\models\yolo.py", line 247, in <lambda> 
forward = lambda x: self.forward(x)[0] if isinstance(m, Segment) else self.forward(x) File "C:\Users\Gebruiker\Documents\GitHub\Deep-learning-\Portfolio opdracht 3\yolov5\models\yolo.py", line 263, in forward return self._forward_once(x, profile, visualize) # single-scale inference, train File "C:\Users\Gebruiker\Documents\GitHub\Deep-learning-\Portfolio opdracht 3\yolov5\models\yolo.py", line 167, in _forward_once x = m(x) # run File "C:\Users\Gebruiker\anaconda3\lib\site-packages\torch\nn\modules\module.py", line 1532, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "C:\Users\Gebruiker\anaconda3\lib\site-packages\torch\nn\modules\module.py", line 1541, in _call_impl return forward_call(*args, **kwargs) File "C:\Users\Gebruiker\Documents\GitHub\Deep-learning-\Portfolio opdracht 3\yolov5\models\common.py", line 238, in forward return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1)) File "C:\Users\Gebruiker\anaconda3\lib\site-packages\torch\nn\modules\module.py", line 1532, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "C:\Users\Gebruiker\anaconda3\lib\site-packages\torch\nn\modules\module.py", line 1541, in _call_impl return forward_call(*args, **kwargs) File "C:\Users\Gebruiker\Documents\GitHub\Deep-learning-\Portfolio opdracht 3\yolov5\models\common.py", line 86, in forward return self.act(self.bn(self.conv(x))) File "C:\Users\Gebruiker\anaconda3\lib\site-packages\torch\nn\modules\module.py", line 1532, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "C:\Users\Gebruiker\anaconda3\lib\site-packages\torch\nn\modules\module.py", line 1541, in _call_impl return forward_call(*args, **kwargs) File "C:\Users\Gebruiker\anaconda3\lib\site-packages\torch\nn\modules\conv.py", line 460, in forward return self._conv_forward(input, self.weight, self.bias) File "C:\Users\Gebruiker\anaconda3\lib\site-packages\torch\nn\modules\conv.py", line 456, in _conv_forward return F.conv2d(input, weight, bias, self.stride, 
RuntimeError: could not create a primitive
Wanneer het model getraind is gaan we het evalueren
!python val.py --task test --weights runs/train/exp/weights/best.pt --data ../data2.yaml
val: data=../data2.yaml, weights=['runs/train/exp/weights/best.pt'], batch_size=32, imgsz=640, conf_thres=0.001, iou_thres=0.6, max_det=300, task=test, device=, workers=8, single_cls=False, augment=False, verbose=False, save_txt=False, save_hybrid=False, save_conf=False, save_json=False, project=runs\val, name=exp, exist_ok=False, half=False, dnn=False
fatal: cannot change to 'C:\Users\Gebruiker\Documents\GitHub\Deep-learning-\Portfolio': No such file or directory
YOLOv5 2024-6-16 Python-3.9.12 torch-2.3.1+cpu CPU
Traceback (most recent call last):
File "C:\Users\Gebruiker\Documents\GitHub\Deep-learning-\Portfolio opdracht 3\yolov5\val.py", line 438, in <module>
main(opt)
File "C:\Users\Gebruiker\Documents\GitHub\Deep-learning-\Portfolio opdracht 3\yolov5\val.py", line 409, in main
run(**vars(opt))
File "C:\Users\Gebruiker\anaconda3\lib\site-packages\torch\utils\_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "C:\Users\Gebruiker\Documents\GitHub\Deep-learning-\Portfolio opdracht 3\yolov5\val.py", line 165, in run
model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data, fp16=half)
File "C:\Users\Gebruiker\Documents\GitHub\Deep-learning-\Portfolio opdracht 3\yolov5\models\common.py", line 467, in __init__
model = attempt_load(weights if isinstance(weights, list) else w, device=device, inplace=True, fuse=fuse)
File "C:\Users\Gebruiker\Documents\GitHub\Deep-learning-\Portfolio opdracht 3\yolov5\models\experimental.py", line 98, in attempt_load
ckpt = torch.load(attempt_download(w), map_location="cpu") # load
File "C:\Users\Gebruiker\anaconda3\lib\site-packages\torch\serialization.py", line 997, in load
with _open_file_like(f, 'rb') as opened_file:
File "C:\Users\Gebruiker\anaconda3\lib\site-packages\torch\serialization.py", line 444, in _open_file_like
return _open_file(name_or_buffer, mode)
File "C:\Users\Gebruiker\anaconda3\lib\site-packages\torch\serialization.py", line 425, in __init__
super().__init__(open(name, mode))
FileNotFoundError: [Errno 2] No such file or directory: 'runs\\train\\exp\\weights\\best.pt'
Tot slot voeren we een inferentie uit.
Daarna worden de resultaten geplot
# Run inference.
!python detect.py --source ../inference --weights runs/train/exp/weights/best.pt --project inference_results --name .
# Plot the three annotated inference result images side by side.
plt.figure(figsize=(25,15))
for i in range(3):
    img = cv2.imread('./inference_results/fashion-test{}.jpg'.format(i+1))
    plt.subplot(1, 3, i+1);
    # cv2 loads BGR; [..., ::-1] reverses the channel axis to RGB for matplotlib.
    plt.imshow(img[...,::-1]);
Om de een of andere reden lukt het niet helemaal om deze code te runnen. Er zijn verschillende dingen geprobeerd maar het is niet gelukt om eruit te komen. Hieronder hebben we geprobeerd om onze data op alleen het voorafgetrainde yolov5 model te runnen zonder deze zelf te trainen.
import torch
from matplotlib import pyplot as plt

# Load the pretrained yolov5s checkpoint from the Ultralytics hub
# (generic pretrained weights — NOTE(review): not fine-tuned on our clothing classes).
model = torch.hub.load("ultralytics/yolov5", "yolov5s")  # Load YOLOv5 model (suggested: yolov5s)

# Define images (you can use URLs, file paths, PIL images, OpenCV images, numpy arrays, or a list of any of these)
imgs = [
    '../Train/images/1.jpg',
    '../Train/images/51.jpg',
    '../Train/images/71.jpg',
]

# Perform inference on each image
results = model(imgs)

# Display results
results.show()  # This will display all images with bounding boxes and labels

# Show the raw images using Matplotlib (optional; no boxes are drawn here)
plt.figure(figsize=(20, 10))
for i, img_path in enumerate(imgs):
    plt.subplot(1, len(imgs), i + 1)
    img = plt.imread(img_path)
    plt.imshow(img)
    plt.axis('off')
plt.show()
Using cache found in C:\Users\Gebruiker/.cache\torch\hub\ultralytics_yolov5_master YOLOv5 2024-6-16 Python-3.9.12 torch-2.3.1+cpu CPU Fusing layers... YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs Adding AutoShape...
Wanneer het yolov5 model op onze eigen data goed kon trainen zou die de personen en autos niet mee nemen maar wel de kledingstukken die de vrouwen aan hebben en niet alleen de handtassen.
Maak een text file waarbij de caption met de image in 1 lijn zit gesplitst met een tab.
# Map each image filename to its caption, then write one "filename<TAB>caption"
# pair per line — the format load_captions_data expects.
caption_dict = dataframe[['filename','caption']].set_index('filename').to_dict()['caption']
os.makedirs('Captions', exist_ok=True)
filepath = os.path.join('Captions', 'captions.txt')
with open(filepath, 'w') as f:
    for filename, caption in caption_dict.items():
        # BUGFIX: interpolate the filename — the original wrote a fixed
        # placeholder string instead of the image name on every line.
        f.write(f'{filename}\t{caption}\n')
Hyperparameters
IMAGE_SIZE: EfficientNetB0 gebruikt een dimensie van 224 bij 224. (Team (z.d.))
VOCAB_SIZE: Meest voorkomende token excluded stopwords komt 6805 keren voor. Dus heb besloten om voor een vocab size van 10000 te kiezen. ChatGPT: Choosing Deep Learning Vocabulary
SEQ_LENGTH: 17 tokens heeft het hoogste frequentie bij aantal tokens zonder stopwords, vandaar we voor 20 hebben gekozen.
EMBED_DIM: 512 is een goede optie als dimensie voor image en token embeddings voor een complex probleem. ChatGPT: Choosing Deep Learning Vocabulary
FF_DIM: Hetzelfde als EMBED_DIM.
BATCH_SIZE: Een hogere batch size kan een hogere loss geven, waarbij de accuracy dan lager zit. (Devansh, 2023)
EPOCHS: 30. We willen niet een te hoge waarde nemen, waar overfitting plaats kan vinden. Maar ook geen lage waarde, waar underfitting plaats kan vinden.
Metrics
Accuracy: Bij deze situatie letten we niet op of de voorspelde omschrijving grammaticaal correct is, maar alleen op of de juiste eigenschappen voorkomen in de voorspellingen, zoals het type hals, de mouwen, en de kleuren. Voor dit doel is accuracy voldoende (ChatGPT: Accuracy vs Creativity in Fashion).
# Directory holding the extracted caption-dataset images.
IMAGES_PATH = "Images"
# Desired image dimensions for EfficientNetB0 (its native input resolution).
IMAGE_SIZE = (224, 224)
# Vocabulary size for text vectorization.
VOCAB_SIZE = 10000
# Fixed length allowed for any token sequence.
SEQ_LENGTH = 17
# Dimension for the image embeddings and token embeddings.
EMBED_DIM = 512
# Per-layer units in the feed-forward network.
FF_DIM = 512
# Other training parameters.
BATCH_SIZE = 64
EPOCHS = 30
# Let tf.data pick prefetch/parallelism buffer sizes dynamically.
AUTOTUNE = tf.data.AUTOTUNE
def load_captions_data(filename):
    """Loads captions (text) data and maps them to corresponding images.

    Args:
        filename: Path to the text file containing caption data, one
            "<image name>\t<caption>" pair per line.

    Returns:
        caption_mapping: Dictionary mapping image names to lists of captions.
        text_data: List containing all kept captions (wrapped in
            <start>/<end> tokens).
        image_paths: List of image paths, one entry per kept caption.
    """
    with open(filename) as caption_file:
        caption_data = caption_file.readlines()
        caption_mapping = {}
        text_data = []
        image_paths = []
        images_to_skip = set()
        for line in caption_data:
            line = line.rstrip("\n")
            # Image name and captions are separated using a tab
            img_name, caption = line.split("\t")
            img_name = os.path.join(IMAGES_PATH, img_name.strip())
            # We will remove captions that are either too short or too long
            tokens = caption.strip().split()
            if len(tokens) < 5 or len(tokens) > SEQ_LENGTH:
                images_to_skip.add(img_name)
                continue
            if img_name.endswith("jpg") and img_name not in images_to_skip:
                # We will add a start and an end token to each caption
                caption = "<start> " + caption.strip() + " <end>"
                text_data.append(caption)
                image_paths.append(img_name)
                if img_name in caption_mapping:
                    caption_mapping[img_name].append(caption)
                else:
                    caption_mapping[img_name] = [caption]
        # NOTE(review): captions appended *before* an image lands in
        # images_to_skip stay in text_data/image_paths even though the image
        # is removed from caption_mapping below — confirm this is intended.
        for img_name in images_to_skip:
            if img_name in caption_mapping:
                del caption_mapping[img_name]
        return caption_mapping, text_data, image_paths
def train_val_split(caption_data, train_size=0.8, shuffle=True):
    """Split the captioning dataset into training and validation dicts.

    Args:
        caption_data (dict): Mapping of image name -> list of captions.
        train_size (float): Fraction of the dataset used for training.
        shuffle (bool): Whether to shuffle the image names before splitting.

    Returns:
        Two dicts: (training split, validation split).
    """
    image_names = list(caption_data.keys())
    if shuffle:
        np.random.shuffle(image_names)
    # Number of images that go into the training split.
    cutoff = int(len(caption_data) * train_size)
    training_split = {name: caption_data[name] for name in image_names[:cutoff]}
    validation_split = {name: caption_data[name] for name in image_names[cutoff:]}
    return training_split, validation_split
# Load the dataset (the captions file written by the cell above)
captions_mapping, text_data, image_paths = load_captions_data("/content/Captions/captions.txt")

# Split the dataset into training and validation sets (80/20 by default)
train_data, valid_data = train_val_split(captions_mapping)
print("Number of training samples: ", len(train_data))
print("Number of validation samples: ", len(valid_data))
Number of training samples: 4292 Number of validation samples: 1074
def custom_standardization(input_string):
    # Lowercase the caption and strip punctuation/digits. `strip_chars` is a
    # module-level constant defined right after this function; it is resolved
    # at call time, so the ordering is safe.
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")
# Characters removed from captions during standardization. Note that "<" and
# ">" are stripped too, so "<start>"/"<end>" become the tokens "start"/"end".
strip_chars = "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~1234567890"
vectorization = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQ_LENGTH,
    standardize=custom_standardization,
)
# Build the vocabulary from all kept captions.
vectorization.adapt(text_data)
def decode_and_resize(img_path):
    """Read an image file from disk and resize it to IMAGE_SIZE.

    Returns a float32 tensor with values still in the [0, 255] range, which
    is what EfficientNetB0 (with its built-in normalization) expects.
    """
    img = tf.io.read_file(img_path)
    img = tf.image.decode_jpeg(img, channels=3)
    # tf.image.resize already returns float32. The original extra
    # convert_image_dtype(float32 -> float32) call was a no-op (it does NOT
    # rescale values to [0, 1]) and has been removed as misleading dead code.
    img = tf.image.resize(img, IMAGE_SIZE)
    return img
def process_input(img_path, captions):
    # Pair each decoded/resized image with its vectorized (integer) captions.
    return decode_and_resize(img_path), vectorization(captions)
def make_dataset(images, captions):
    """Build a shuffled, batched and prefetched tf.data pipeline of
    (image tensor, vectorized caption) pairs."""
    return (
        tf.data.Dataset.from_tensor_slices((images, captions))
        .shuffle(BATCH_SIZE * 8)
        .map(process_input, num_parallel_calls=AUTOTUNE)
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )
# Pass the list of images and the list of corresponding captions.
# Each dataset element is (batch of images, batch of caption-token matrices).
train_dataset = make_dataset(list(train_data.keys()), list(train_data.values()))
valid_dataset = make_dataset(list(valid_data.keys()), list(valid_data.values()))
Image Augmentation
Flip: mirror versie van kleding ziet bijna hetzelfde uit, dus een horizontale flip is een goede optie hier.
Rotation: een kleine rotatie, omdat niet alle kleren op de foto's er precies recht uitzien.
Contrast: Invloed van verlichting of filters.
# Data augmentation, applied to training batches only (see ImageCaptioningModel).
image_augmentation = Sequential(
    [
        tf.keras.layers.RandomFlip("horizontal"),  # mirrored clothing looks the same
        tf.keras.layers.RandomRotation(0.2),       # garments are not always perfectly upright
        tf.keras.layers.RandomContrast(0.3),       # simulate lighting/filter differences
    ]
)
Model
Het model bestaat uit 3 delen, een CNN feature extractor, een Transformer encoder en een Transformer decoder.
CNN feature extractor:
De EfficientNetB0 wordt ingeladen zonder de top layer, want de top layer is de classification layer; we hebben alleen de features nodig. Het model wordt bevroren (freeze), zodat de gewichten niet geüpdatet worden tijdens het trainen — het is al pre-trained op de grote ImageNet-dataset. Daarna wordt de output gereshaped voor de encoder.
De encoder bevat:
De decoder bevat:
def get_cnn_model():
    """Return a frozen EfficientNetB0 feature extractor.

    The classification head is dropped (include_top=False) and the spatial
    feature map is flattened into a sequence of feature vectors so it can be
    consumed by the Transformer encoder.
    """
    backbone = efficientnet.EfficientNetB0(
        input_shape=(*IMAGE_SIZE, 3),
        include_top=False,
        weights="imagenet",
    )
    # Keep the ImageNet-pretrained weights fixed during caption training.
    backbone.trainable = False
    features = layers.Reshape((-1, backbone.output.shape[-1]))(backbone.output)
    return keras.models.Model(backbone.input, features)
class TransformerEncoderBlock(layers.Layer):
    """Single Transformer encoder block applied to the CNN image features."""

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        # NOTE(review): dense_dim is stored but never used — dense_1 below
        # projects to embed_dim, not dense_dim. Confirm this is intended.
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        # Self-attention without dropout (the encoder input is deterministic
        # CNN features, not tokens).
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.0
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.dense_1 = layers.Dense(embed_dim, activation="relu")

    def call(self, inputs, training, mask=None):
        # Pre-normalize and project the image features, then apply
        # self-attention with a residual connection.
        inputs = self.layernorm_1(inputs)
        inputs = self.dense_1(inputs)
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=None,
            training=training,
        )
        out_1 = self.layernorm_2(inputs + attention_output_1)
        return out_1
class PositionalEmbedding(layers.Layer):
    """Token embedding plus learned positional embedding for caption tokens."""

    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        # Scale token embeddings by sqrt(embed_dim), as in the original
        # "Attention Is All You Need" Transformer.
        self.embed_scale = tf.math.sqrt(tf.cast(embed_dim, tf.float32))

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_tokens = embedded_tokens * self.embed_scale
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        # Token id 0 is padding; propagate a padding mask to downstream layers.
        return tf.math.not_equal(inputs, 0)
class TransformerDecoderBlock(layers.Layer):
    """Transformer decoder block.

    Performs causally-masked self-attention over the caption tokens,
    cross-attention over the encoded image features, then a feed-forward
    network and a softmax projection over the vocabulary.
    """

    def __init__(self, embed_dim, ff_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
        )
        self.ffn_layer_1 = layers.Dense(ff_dim, activation="relu")
        self.ffn_layer_2 = layers.Dense(embed_dim)
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.embedding = PositionalEmbedding(
            embed_dim=EMBED_DIM,
            sequence_length=SEQ_LENGTH,
            vocab_size=VOCAB_SIZE,
        )
        self.out = layers.Dense(VOCAB_SIZE, activation="softmax")
        self.dropout_1 = layers.Dropout(0.3)
        self.dropout_2 = layers.Dropout(0.5)
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, training, mask=None):
        inputs = self.embedding(inputs)
        causal_mask = self.get_causal_attention_mask(inputs)
        # BUG FIX: previously padding_mask/combined_mask were only bound when
        # mask was not None, raising a NameError for mask=None. Fall back to
        # the causal mask alone (and no padding mask) in that case; behavior
        # with a provided mask is unchanged.
        padding_mask = None
        combined_mask = causal_mask
        if mask is not None:
            padding_mask = tf.cast(mask[:, :, tf.newaxis], dtype=tf.int32)
            combined_mask = tf.cast(mask[:, tf.newaxis, :], dtype=tf.int32)
            combined_mask = tf.minimum(combined_mask, causal_mask)
        # Masked self-attention over the caption tokens.
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=combined_mask,
            training=training,
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)
        # Cross-attention over the encoded image features.
        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
            training=training,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)
        # Position-wise feed-forward network with residual connection.
        ffn_out = self.ffn_layer_1(out_2)
        ffn_out = self.dropout_1(ffn_out, training=training)
        ffn_out = self.ffn_layer_2(ffn_out)
        ffn_out = self.layernorm_3(ffn_out + out_2, training=training)
        ffn_out = self.dropout_2(ffn_out, training=training)
        preds = self.out(ffn_out)
        return preds

    def get_causal_attention_mask(self, inputs):
        # Lower-triangular (i >= j) mask, tiled across the batch, so each
        # position can only attend to itself and earlier positions.
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [
                tf.expand_dims(batch_size, -1),
                tf.constant([1, 1], dtype=tf.int32),
            ],
            axis=0,
        )
        return tf.tile(mask, mult)
class ImageCaptioningModel(keras.Model):
    """Ties the CNN feature extractor, Transformer encoder and decoder
    together, with custom padding-masked loss/accuracy tracking."""

    def __init__(
        self,
        cnn_model,
        encoder,
        decoder,
        num_captions_per_image=1,
        image_aug=None,
    ):
        super().__init__()
        self.cnn_model = cnn_model
        self.encoder = encoder
        self.decoder = decoder
        self.loss_tracker = keras.metrics.Mean(name="loss")
        self.acc_tracker = keras.metrics.Mean(name="accuracy")
        self.num_captions_per_image = num_captions_per_image
        self.image_aug = image_aug

    def calculate_loss(self, y_true, y_pred, mask):
        # Per-token cross-entropy (self.loss is set via compile()), averaged
        # over non-padding positions only.
        loss = self.loss(y_true, y_pred)
        mask = tf.cast(mask, dtype=loss.dtype)
        loss *= mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

    def calculate_accuracy(self, y_true, y_pred, mask):
        # Token-level accuracy, again ignoring padding positions.
        accuracy = tf.equal(y_true, tf.argmax(y_pred, axis=2))
        accuracy = tf.math.logical_and(mask, accuracy)
        accuracy = tf.cast(accuracy, dtype=tf.float32)
        mask = tf.cast(mask, dtype=tf.float32)
        return tf.reduce_sum(accuracy) / tf.reduce_sum(mask)

    def _compute_caption_loss_and_acc(self, img_embed, batch_seq, training=True):
        encoder_out = self.encoder(img_embed, training=training)
        # Teacher forcing: the decoder sees tokens [0..n-1] and must predict
        # tokens [1..n].
        batch_seq_inp = batch_seq[:, :-1]
        batch_seq_true = batch_seq[:, 1:]
        mask = tf.math.not_equal(batch_seq_true, 0)
        batch_seq_pred = self.decoder(
            batch_seq_inp, encoder_out, training=training, mask=mask
        )
        loss = self.calculate_loss(batch_seq_true, batch_seq_pred, mask)
        acc = self.calculate_accuracy(batch_seq_true, batch_seq_pred, mask)
        return loss, acc

    def train_step(self, batch_data):
        batch_img, batch_seq = batch_data
        batch_loss = 0
        batch_acc = 0
        # Augmentation is applied during training only.
        if self.image_aug:
            batch_img = self.image_aug(batch_img)
        # 1. Get image embeddings
        img_embed = self.cnn_model(batch_img)
        # 2. Pass each caption to the decoder along with the encoder outputs
        # and compute the loss as well as accuracy for each caption.
        for i in range(self.num_captions_per_image):
            with tf.GradientTape() as tape:
                loss, acc = self._compute_caption_loss_and_acc(
                    img_embed, batch_seq[:, i, :], training=True
                )
                # 3. Update loss and accuracy
                batch_loss += loss
                batch_acc += acc
            # 4. Get the list of all the trainable weights
            train_vars = (
                self.encoder.trainable_variables + self.decoder.trainable_variables
            )
            # 5. Get the gradients
            # NOTE(review): gradients come from the *current* caption's loss
            # only, not batch_loss; with num_captions_per_image=1 (the value
            # used in this notebook) the two are equivalent.
            grads = tape.gradient(loss, train_vars)
            # 6. Update the trainable weights
            self.optimizer.apply_gradients(zip(grads, train_vars))
        # 7. Update the trackers (loss is the *sum* over captions, accuracy
        # is the mean).
        batch_acc /= float(self.num_captions_per_image)
        self.loss_tracker.update_state(batch_loss)
        self.acc_tracker.update_state(batch_acc)
        # 8. Return the loss and accuracy values
        return {
            "loss": self.loss_tracker.result(),
            "acc": self.acc_tracker.result(),
        }

    def test_step(self, batch_data):
        batch_img, batch_seq = batch_data
        batch_loss = 0
        batch_acc = 0
        # 1. Get image embeddings (no augmentation at evaluation time)
        img_embed = self.cnn_model(batch_img)
        # 2. Pass each caption to the decoder along with the encoder outputs
        # and compute the loss as well as accuracy for each caption.
        for i in range(self.num_captions_per_image):
            loss, acc = self._compute_caption_loss_and_acc(
                img_embed, batch_seq[:, i, :], training=False
            )
            # 3. Update batch loss and batch accuracy
            batch_loss += loss
            batch_acc += acc
        batch_acc /= float(self.num_captions_per_image)
        # 4. Update the trackers
        self.loss_tracker.update_state(batch_loss)
        self.acc_tracker.update_state(batch_acc)
        # 5. Return the loss and accuracy values
        return {
            "loss": self.loss_tracker.result(),
            "acc": self.acc_tracker.result(),
        }

    @property
    def metrics(self):
        # We need to list our metrics here so `reset_states()` can be
        # called automatically between epochs.
        return [self.loss_tracker, self.acc_tracker]
# Instantiate the three sub-models (single-head encoder, two-head decoder).
cnn_model = get_cnn_model()
encoder = TransformerEncoderBlock(embed_dim=EMBED_DIM, dense_dim=FF_DIM, num_heads=1)
decoder = TransformerDecoderBlock(embed_dim=EMBED_DIM, ff_dim=FF_DIM, num_heads=2)
Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5 16705208/16705208 [==============================] - 1s 0us/step
caption_model = ImageCaptioningModel(
    cnn_model=cnn_model,
    encoder=encoder,
    decoder=decoder,
    image_aug=image_augmentation,
)

# Define the loss function.
# reduction='none' keeps per-token losses so the model can mask out padding
# positions before averaging (see ImageCaptioningModel.calculate_loss).
cross_entropy = keras.losses.SparseCategoricalCrossentropy(
    from_logits=False,
    reduction='none',
)

# EarlyStopping criteria: stop when val_loss has not improved for 3 epochs
# and restore the best weights seen so far.
early_stopping = keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
# Learning Rate Scheduler for the optimizer
class LRSchedule(keras.optimizers.schedules.LearningRateSchedule):
    """Linear warmup to a fixed learning rate.

    During the first `warmup_steps` steps the learning rate grows linearly
    from 0 up to `post_warmup_learning_rate`; afterwards it stays constant.
    """

    def __init__(self, post_warmup_learning_rate, warmup_steps):
        super().__init__()
        self.post_warmup_learning_rate = post_warmup_learning_rate
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        current_step = tf.cast(step, tf.float32)
        total_warmup = tf.cast(self.warmup_steps, tf.float32)
        ramped_lr = self.post_warmup_learning_rate * (current_step / total_warmup)
        # Ramped rate while warming up, fixed rate afterwards.
        return tf.cond(
            current_step < total_warmup,
            lambda: ramped_lr,
            lambda: self.post_warmup_learning_rate,
        )
# Create a learning rate schedule: warm up over the first ~1/15th of training.
num_train_steps = len(train_dataset) * EPOCHS
num_warmup_steps = num_train_steps // 15
lr_schedule = LRSchedule(post_warmup_learning_rate=1e-4, warmup_steps=num_warmup_steps)

# Compile the model.
# BUG FIX: lr_schedule was created but never used — optimizer='adam' ran with
# Adam's default learning rate. Pass the schedule to Adam explicitly.
caption_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
    loss=cross_entropy,
)

# Fit the model
caption_model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=valid_dataset,
    callbacks=[early_stopping],
)
Epoch 1/30 68/68 [==============================] - 107s 1s/step - loss: 3.9434 - acc: 0.2152 - val_loss: 2.5418 - val_acc: 0.4547 Epoch 2/30 68/68 [==============================] - 57s 832ms/step - loss: 2.4169 - acc: 0.4555 - val_loss: 2.1974 - val_acc: 0.4991 Epoch 3/30 68/68 [==============================] - 61s 897ms/step - loss: 2.1418 - acc: 0.4952 - val_loss: 2.0733 - val_acc: 0.5217 Epoch 4/30 68/68 [==============================] - 60s 879ms/step - loss: 1.9983 - acc: 0.5123 - val_loss: 2.0223 - val_acc: 0.5272 Epoch 5/30 68/68 [==============================] - 72s 1s/step - loss: 1.8923 - acc: 0.5291 - val_loss: 1.9937 - val_acc: 0.5355 Epoch 6/30 68/68 [==============================] - 66s 982ms/step - loss: 1.7966 - acc: 0.5422 - val_loss: 1.9563 - val_acc: 0.5442 Epoch 7/30 68/68 [==============================] - 63s 915ms/step - loss: 1.7197 - acc: 0.5573 - val_loss: 1.9428 - val_acc: 0.5490 Epoch 8/30 68/68 [==============================] - 64s 929ms/step - loss: 1.6550 - acc: 0.5688 - val_loss: 1.9676 - val_acc: 0.5447 Epoch 9/30 68/68 [==============================] - 50s 738ms/step - loss: 1.6161 - acc: 0.5740 - val_loss: 1.9661 - val_acc: 0.5488 Epoch 10/30 68/68 [==============================] - 49s 722ms/step - loss: 1.5715 - acc: 0.5827 - val_loss: 1.9554 - val_acc: 0.5501
<keras.src.callbacks.History at 0x7baf7ba9ee30>
Resultaten van image captioning
# Build an index -> token lookup table for greedy decoding.
vocab = vectorization.get_vocabulary()
index_lookup = dict(zip(range(len(vocab)), vocab))
# One position of the sequence is consumed by the <start> token.
max_decoded_sentence_length = SEQ_LENGTH - 1
valid_images = list(valid_data.keys())
def generate_caption():
    """Greedily decode and print a caption for a random validation image."""
    # Select a random image from the validation dataset
    sample_img = np.random.choice(valid_images)
    # Read the image from the disk
    sample_img = decode_and_resize(sample_img)
    img = sample_img.numpy().clip(0, 255).astype(np.uint8)
    plt.imshow(img)
    plt.show()
    # Pass the image to the CNN
    img = tf.expand_dims(sample_img, 0)
    img = caption_model.cnn_model(img)
    # Pass the image features to the Transformer encoder
    encoded_img = caption_model.encoder(img, training=False)
    # Generate the caption token by token (greedy argmax), feeding the tokens
    # decoded so far back into the Transformer decoder at every step.
    decoded_caption = "<start> "
    for i in range(max_decoded_sentence_length):
        tokenized_caption = vectorization([decoded_caption])[:, :-1]
        mask = tf.math.not_equal(tokenized_caption, 0)
        predictions = caption_model.decoder(
            tokenized_caption, encoded_img, training=False, mask=mask
        )
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = index_lookup[sampled_token_index]
        # NOTE(review): custom_standardization strips "<" and ">", so the
        # vocabulary token is "end", not "<end>" — this break may never
        # trigger and decoding then always runs the full length; verify.
        if sampled_token == "<end>":
            break
        decoded_caption += " " + sampled_token
    # Strip the special tokens before printing.
    decoded_caption = decoded_caption.replace("<start> ", "")
    decoded_caption = decoded_caption.replace(" <end>", "").strip()
    print("Predicted Caption: ", decoded_caption)

# Check predictions for a few samples
generate_caption()
generate_caption()
generate_caption()
generate_caption()
generate_caption()
Predicted Caption: solid red longsleeved jumper soft ribknit cotton blend ribbing around neckline cuffs hem end seam waist
Predicted Caption: solid black longsleeved jumper soft fineknit cotton blend ribbing around neckline cuffs hem end seam waist
Predicted Caption: solid black trousers soft viscose jersey elasticated waist side pockets straight wide legs end seam waist
Predicted Caption: solid white top ribbed jersey narrow shoulder straps vneck front back end seam waist end seam
Predicted Caption: solid white sleeveless top soft jersey made cotton blend vneck front back end seam centre back
De 1e prediction bevat 'red', maar dit moet 'green' zijn. Bij de 4e en 5e ziet het er niet uit als een v-neck.
Verder hebben we een lagere sequence length geprobeerd, omdat de captions vrij breed zijn geformuleerd: binnen 12 woorden kan ik uit de caption al begrijpen wat er beschreven wordt. We willen ook niet te laag gaan, want anders gaat het model overfitten en slechter presteren op nieuwe data (ChatGPT: Small Sequence Length Risks).
We hebben gekozen om 12 en 15 uit te proberen.
SEQ_LENGTH = 12
# Experiment: shorter caption limit. load_captions_data reads SEQ_LENGTH, so
# the dataset is re-filtered with the new value.
SEQ_LENGTH = 12
# Load the dataset
captions_mapping, text_data, image_paths = load_captions_data("/content/Captions/captions.txt")
# Split the dataset into training and validation sets
train_data, valid_data = train_val_split(captions_mapping)
print("Number of training samples: ", len(train_data))
print("Number of validation samples: ", len(valid_data))
Number of training samples: 1005 Number of validation samples: 252
# Rebuild the vectorizer so output_sequence_length matches the new SEQ_LENGTH.
strip_chars = "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~1234567890"
vectorization = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQ_LENGTH,
    standardize=custom_standardization,
)
vectorization.adapt(text_data)
# Pass the list of images and the list of corresponding captions
train_dataset = make_dataset(list(train_data.keys()), list(train_data.values()))
valid_dataset = make_dataset(list(valid_data.keys()), list(valid_data.values()))
# Fresh, untrained model instances for this experiment.
cnn_model = get_cnn_model()
encoder = TransformerEncoderBlock(embed_dim=EMBED_DIM, dense_dim=FF_DIM, num_heads=1)
decoder = TransformerDecoderBlock(embed_dim=EMBED_DIM, ff_dim=FF_DIM, num_heads=2)
caption_model = ImageCaptioningModel(
    cnn_model=cnn_model,
    encoder=encoder,
    decoder=decoder,
    image_aug=image_augmentation,
)
# Create a learning rate schedule (linear warmup, see LRSchedule above).
num_train_steps = len(train_dataset) * EPOCHS
num_warmup_steps = num_train_steps // 15
lr_schedule = LRSchedule(post_warmup_learning_rate=1e-4, warmup_steps=num_warmup_steps)

# Compile the model.
# BUG FIX: lr_schedule was created but never used ('adam' ran with its
# default learning rate); pass the schedule to the Adam optimizer.
caption_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
    loss=cross_entropy,
)

# Fit the model
caption_model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=valid_dataset,
    callbacks=[early_stopping],
)
Epoch 1/30 16/16 [==============================] - 30s 1s/step - loss: 6.0503 - acc: 0.0970 - val_loss: 4.0908 - val_acc: 0.2758 Epoch 2/30 16/16 [==============================] - 10s 576ms/step - loss: 3.5869 - acc: 0.3192 - val_loss: 3.1824 - val_acc: 0.3863 Epoch 3/30 16/16 [==============================] - 14s 853ms/step - loss: 2.8338 - acc: 0.4169 - val_loss: 2.7448 - val_acc: 0.4421 Epoch 4/30 16/16 [==============================] - 13s 843ms/step - loss: 2.4068 - acc: 0.4642 - val_loss: 2.5464 - val_acc: 0.4565 Epoch 5/30 16/16 [==============================] - 11s 716ms/step - loss: 2.1250 - acc: 0.5103 - val_loss: 2.4299 - val_acc: 0.4725 Epoch 6/30 16/16 [==============================] - 13s 838ms/step - loss: 1.9229 - acc: 0.5344 - val_loss: 2.3796 - val_acc: 0.4859 Epoch 7/30 16/16 [==============================] - 14s 879ms/step - loss: 1.7534 - acc: 0.5624 - val_loss: 2.3629 - val_acc: 0.4862 Epoch 8/30 16/16 [==============================] - 13s 846ms/step - loss: 1.6121 - acc: 0.5849 - val_loss: 2.3367 - val_acc: 0.4991 Epoch 9/30 16/16 [==============================] - 12s 776ms/step - loss: 1.4845 - acc: 0.6126 - val_loss: 2.3782 - val_acc: 0.4871 Epoch 10/30 16/16 [==============================] - 13s 815ms/step - loss: 1.3876 - acc: 0.6336 - val_loss: 2.3794 - val_acc: 0.4827 Epoch 11/30 16/16 [==============================] - 14s 846ms/step - loss: 1.2998 - acc: 0.6473 - val_loss: 2.4068 - val_acc: 0.4886
<keras.src.callbacks.History at 0x7cde8e2e2dd0>
Resultaten van image captioning
# Rebuild the decoding lookup for the new vectorizer and sequence length.
vocab = vectorization.get_vocabulary()
index_lookup = dict(zip(range(len(vocab)), vocab))
max_decoded_sentence_length = SEQ_LENGTH - 1
valid_images = list(valid_data.keys())
# Check predictions for a few samples
generate_caption()
generate_caption()
generate_caption()
generate_caption()
generate_caption()
Predicted Caption: solid black fitted top ribbed jersey deep neckline narrow shoulder straps
Predicted Caption: solid black fitted top soft ribbed jersey vneck short sleeves end
Predicted Caption: solid light beige top soft viscose jersey vneck short sleeves end
Predicted Caption: melange dark grey jumper soft fine knit containing wool ribbing around
Predicted Caption: solid white vest top ribbed jersey made cotton blend end flounces
De predictions zijn best acceptabel hier. Alleen bij de 2e ben ik zelf niet zeker of het wel of geen v-neck is, want het kan dat het round gedeelte de achterste gedeelte is. Maar over het algemeen zou ik het captionen met de eerste 4-6 woorden die beschreven is.
SEQ_LENGTH = 15
# Experiment: medium caption limit (re-filters the dataset, see above).
SEQ_LENGTH = 15
# Load the dataset
captions_mapping, text_data, image_paths = load_captions_data("/content/Captions/captions.txt")
# Split the dataset into training and validation sets
train_data, valid_data = train_val_split(captions_mapping)
print("Number of training samples: ", len(train_data))
print("Number of validation samples: ", len(valid_data))
Number of training samples: 2810 Number of validation samples: 703
# Rebuild the vectorizer so output_sequence_length matches the new SEQ_LENGTH.
vectorization = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQ_LENGTH,
    standardize=custom_standardization,
)
vectorization.adapt(text_data)
# Pass the list of images and the list of corresponding captions
train_dataset = make_dataset(list(train_data.keys()), list(train_data.values()))
valid_dataset = make_dataset(list(valid_data.keys()), list(valid_data.values()))
# Fresh, untrained model instances for this experiment.
cnn_model = get_cnn_model()
encoder = TransformerEncoderBlock(embed_dim=EMBED_DIM, dense_dim=FF_DIM, num_heads=1)
decoder = TransformerDecoderBlock(embed_dim=EMBED_DIM, ff_dim=FF_DIM, num_heads=2)
caption_model = ImageCaptioningModel(
    cnn_model=cnn_model,
    encoder=encoder,
    decoder=decoder,
    image_aug=image_augmentation,
)
# Create a learning rate schedule (linear warmup, see LRSchedule above).
num_train_steps = len(train_dataset) * EPOCHS
num_warmup_steps = num_train_steps // 15
lr_schedule = LRSchedule(post_warmup_learning_rate=1e-4, warmup_steps=num_warmup_steps)

# Compile the model.
# BUG FIX: lr_schedule was created but never used ('adam' ran with its
# default learning rate); pass the schedule to the Adam optimizer.
caption_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
    loss=cross_entropy,
)

# Fit the model
caption_model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=valid_dataset,
    callbacks=[early_stopping],
)
Epoch 1/30 44/44 [==============================] - 66s 1s/step - loss: 4.4638 - acc: 0.1738 - val_loss: 2.8773 - val_acc: 0.4100 Epoch 2/30 44/44 [==============================] - 34s 760ms/step - loss: 2.5804 - acc: 0.4407 - val_loss: 2.3814 - val_acc: 0.4741 Epoch 3/30 44/44 [==============================] - 36s 830ms/step - loss: 2.1961 - acc: 0.4880 - val_loss: 2.2263 - val_acc: 0.4918 Epoch 4/30 44/44 [==============================] - 35s 792ms/step - loss: 1.9992 - acc: 0.5222 - val_loss: 2.1372 - val_acc: 0.5090 Epoch 5/30 44/44 [==============================] - 35s 810ms/step - loss: 1.8662 - acc: 0.5380 - val_loss: 2.0783 - val_acc: 0.5227 Epoch 6/30 44/44 [==============================] - 33s 762ms/step - loss: 1.7593 - acc: 0.5585 - val_loss: 2.0557 - val_acc: 0.5224 Epoch 7/30 44/44 [==============================] - 31s 702ms/step - loss: 1.6617 - acc: 0.5691 - val_loss: 2.0492 - val_acc: 0.5265 Epoch 8/30 44/44 [==============================] - 35s 787ms/step - loss: 1.5709 - acc: 0.5824 - val_loss: 2.0568 - val_acc: 0.5256 Epoch 9/30 44/44 [==============================] - 34s 786ms/step - loss: 1.5091 - acc: 0.5964 - val_loss: 2.0594 - val_acc: 0.5250 Epoch 10/30 44/44 [==============================] - 30s 669ms/step - loss: 1.4474 - acc: 0.6049 - val_loss: 2.0785 - val_acc: 0.5268
<keras.src.callbacks.History at 0x7cde8e47ba90>
Resultaten van image captioning
# Rebuild the decoding lookup for the new vectorizer and sequence length.
vocab = vectorization.get_vocabulary()
index_lookup = dict(zip(range(len(vocab)), vocab))
max_decoded_sentence_length = SEQ_LENGTH - 1
valid_images = list(valid_data.keys())
# Check predictions for a few samples
generate_caption()
generate_caption()
generate_caption()
generate_caption()
generate_caption()
Predicted Caption: embroidery grey top soft cotton jersey embroidery top short sleeves embroidery cuffs end end
Predicted Caption: pattern red sleeveless top soft viscose jersey lace yoke back end flounces end flounces
Predicted Caption: solid black short dress soft viscose jersey short sleeves seam waist bellshaped skirt unlined
Predicted Caption: solid light beige straightcut tshirt soft cotton jersey round neckline end flounces top end
Predicted Caption: solid red vneck top soft ribknit viscose blend narrow shoulder straps end flounces front
In dit geval kunnen we bij de 1e en 2e prediction al zien dat het kleur en het type mouw verkeerd is predict.
We krijgen nog steeds niet de beste scores. Zoals eerder verteld kan het model overfitten als de sequence length te laag is. We hebben gekozen voor 30, want dit zit ook dichtbij het maximale aantal tokens dat de data bevat zonder stopwords.
# Experiment: long caption limit (re-filters the dataset, see above).
SEQ_LENGTH = 30
# Load the dataset
captions_mapping, text_data, image_paths = load_captions_data("/content/Captions/captions.txt")
# Split the dataset into training and validation sets
train_data, valid_data = train_val_split(captions_mapping)
print("Number of training samples: ", len(train_data))
print("Number of validation samples: ", len(valid_data))
Number of training samples: 9937 Number of validation samples: 2485
# Rebuild the vectorizer so output_sequence_length matches the new SEQ_LENGTH.
vectorization = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQ_LENGTH,
    standardize=custom_standardization,
)
vectorization.adapt(text_data)
# Pass the list of images and the list of corresponding captions
train_dataset = make_dataset(list(train_data.keys()), list(train_data.values()))
valid_dataset = make_dataset(list(valid_data.keys()), list(valid_data.values()))
# Fresh, untrained sub-model instances for this experiment.
cnn_model = get_cnn_model()
encoder = TransformerEncoderBlock(embed_dim=EMBED_DIM, dense_dim=FF_DIM, num_heads=1)
decoder = TransformerDecoderBlock(embed_dim=EMBED_DIM, ff_dim=FF_DIM, num_heads=2)
Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5 16705208/16705208 [==============================] - 0s 0us/step
# Assemble the captioning model with augmentation enabled for training.
caption_model = ImageCaptioningModel(
    cnn_model=cnn_model,
    encoder=encoder,
    decoder=decoder,
    image_aug=image_augmentation,
)
# Create a learning rate schedule (linear warmup, see LRSchedule above).
num_train_steps = len(train_dataset) * EPOCHS
num_warmup_steps = num_train_steps // 15
lr_schedule = LRSchedule(post_warmup_learning_rate=1e-4, warmup_steps=num_warmup_steps)

# Compile the model.
# BUG FIX: lr_schedule was created but never used ('adam' ran with its
# default learning rate); pass the schedule to the Adam optimizer.
caption_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
    loss=cross_entropy,
)

# Fit the model
caption_model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=valid_dataset,
    callbacks=[early_stopping],
)
Epoch 1/30 156/156 [==============================] - 152s 830ms/step - loss: 3.1265 - acc: 0.2928 - val_loss: 2.2184 - val_acc: 0.5021 Epoch 2/30 156/156 [==============================] - 120s 768ms/step - loss: 2.1524 - acc: 0.4983 - val_loss: 2.0069 - val_acc: 0.5338 Epoch 3/30 156/156 [==============================] - 140s 895ms/step - loss: 1.9773 - acc: 0.5269 - val_loss: 1.9099 - val_acc: 0.5529 Epoch 4/30 156/156 [==============================] - 120s 767ms/step - loss: 1.8845 - acc: 0.5405 - val_loss: 1.8737 - val_acc: 0.5593 Epoch 5/30 156/156 [==============================] - 138s 886ms/step - loss: 1.8136 - acc: 0.5514 - val_loss: 1.8577 - val_acc: 0.5611 Epoch 6/30 156/156 [==============================] - 121s 778ms/step - loss: 1.7562 - acc: 0.5611 - val_loss: 1.8272 - val_acc: 0.5699 Epoch 7/30 156/156 [==============================] - 137s 879ms/step - loss: 1.7127 - acc: 0.5684 - val_loss: 1.8187 - val_acc: 0.5689 Epoch 8/30 156/156 [==============================] - 115s 738ms/step - loss: 1.6726 - acc: 0.5760 - val_loss: 1.8105 - val_acc: 0.5736 Epoch 9/30 156/156 [==============================] - 121s 774ms/step - loss: 1.6375 - acc: 0.5806 - val_loss: 1.8100 - val_acc: 0.5726 Epoch 10/30 156/156 [==============================] - 140s 896ms/step - loss: 1.6097 - acc: 0.5853 - val_loss: 1.8084 - val_acc: 0.5745 Epoch 11/30 156/156 [==============================] - 139s 893ms/step - loss: 1.5799 - acc: 0.5922 - val_loss: 1.8082 - val_acc: 0.5743 Epoch 12/30 156/156 [==============================] - 139s 895ms/step - loss: 1.5573 - acc: 0.5964 - val_loss: 1.8059 - val_acc: 0.5765 Epoch 13/30 156/156 [==============================] - 140s 899ms/step - loss: 1.5334 - acc: 0.6004 - val_loss: 1.8103 - val_acc: 0.5751 Epoch 14/30 156/156 [==============================] - 140s 896ms/step - loss: 1.5112 - acc: 0.6036 - val_loss: 1.8176 - val_acc: 0.5747 Epoch 15/30 156/156 [==============================] - 122s 781ms/step - loss: 1.4892 - 
acc: 0.6066 - val_loss: 1.8113 - val_acc: 0.5769
<keras.src.callbacks.History at 0x7ce11c22dc00>
We zien dat de score hier een klein stuk beter is dan bij de vorige modellen.
Resultaten van image captioning
# Rebuild the decoding lookup for the new vectorizer and sequence length.
vocab = vectorization.get_vocabulary()
index_lookup = dict(zip(range(len(vocab)), vocab))
max_decoded_sentence_length = SEQ_LENGTH - 1
valid_images = list(valid_data.keys())
def generate_caption():
    """Pick a random validation image, display it, and print a greedy-decoded caption."""
    # Draw a random image path from the validation split.
    sample_img = np.random.choice(valid_images)
    # Load and preprocess the image from disk.
    sample_img = decode_and_resize(sample_img)
    shown = sample_img.numpy().clip(0, 255).astype(np.uint8)
    plt.imshow(shown)
    plt.show()
    # Extract CNN features and pass them through the Transformer encoder.
    features = caption_model.cnn_model(tf.expand_dims(sample_img, 0))
    encoded_img = caption_model.encoder(features, training=False)
    # Greedy decoding: feed the caption-so-far back in, one token per step.
    decoded_caption = "<start> "
    for step in range(max_decoded_sentence_length):
        tokenized = vectorization([decoded_caption])[:, :-1]
        pad_mask = tf.math.not_equal(tokenized, 0)
        predictions = caption_model.decoder(
            tokenized, encoded_img, training=False, mask=pad_mask
        )
        next_index = np.argmax(predictions[0, step, :])
        next_token = index_lookup[next_index]
        # NOTE(review): this break only fires if "<end>" survives the text
        # standardization step -- confirm strip_chars keeps angle brackets.
        if next_token == "<end>":
            break
        decoded_caption += " " + next_token
    decoded_caption = decoded_caption.replace("<start> ", "")
    decoded_caption = decoded_caption.replace(" <end>", "").strip()
    print("Predicted Caption: ", decoded_caption)
# Check predictions for a few samples: caption five random validation images.
for _ in range(5):
    generate_caption()
Predicted Caption: pattern black trousers patterned weave wide elastication back waist side pockets wide legs end pressstuds end pressstuds end pressstuds end pressstuds end pressstuds end pressstuds end pressstuds end pressstuds
Predicted Caption: solid white shorts widewale corduroy high waist zip fly concealed hookandeye fastening side pockets welt back pockets end pressstuds end pressstuds end pressstuds end lined end end end end
Predicted Caption: solid dark blue straightcut shirt jacket soft twill notch lapels buttons front flap chest pocket button cuffs rounded hem end end end end end end end end end end
Predicted Caption: pattern dark blue blouse airy weave vneck short sleeves narrow ties end seam woven fabric lined end flounces end end end end end end end end end end end
Predicted Caption: solid black shorts woven fabric high waist concealed zip hookandeye fastener side pockets welt back pockets sewnin turnups hems end pressstuds front back lined end pressstuds end pressstuds end
Hier zien we aan elk einde een herhaling van dezelfde woord(en). Bij de 2e prediction ziet het er meer uit als een skirt dan als een short. Bij de 4e prediction geeft het 'v-neck' en 'short sleeves' aan, maar dat type hals komt niet voor op de foto en het type mouw ziet er niet kort uit. Bij de 5e is er geen 'concealed zip hookandeye'.
SEQ_LENGTH = 12

# Load the caption data and split it into training and validation sets.
captions_mapping, text_data, image_paths = load_captions_data("/content/Captions/captions.txt")
train_data, valid_data = train_val_split(captions_mapping)

# Characters removed by custom_standardization.
# BUG FIX: "<" and ">" must NOT be stripped, otherwise the special
# "<start>"/"<end>" tokens are standardized to the bare words "start"/"end".
# That is why decoding never hit the "<end>" stop check and the predicted
# captions ended in long "end end end ..." repeats.
# (Assumes custom_standardization reads the module-level strip_chars at call
# time, as in the Keras captioning example -- verify its definition.)
strip_chars = "!\"#$%&'()*+,-./:;=?@[\]^_`{|}~1234567890"

# Integer-encode captions to fixed-length sequences of token ids.
vectorization = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQ_LENGTH,
    standardize=custom_standardization,
)
vectorization.adapt(text_data)

# Build tf.data pipelines pairing image paths with their caption lists.
train_dataset = make_dataset(list(train_data.keys()), list(train_data.values()))
valid_dataset = make_dataset(list(valid_data.keys()), list(valid_data.values()))
# Assemble the captioning model: CNN feature extractor + Transformer
# encoder/decoder, with image augmentation applied during training.
cnn_model = get_cnn_model()
encoder = TransformerEncoderBlock(embed_dim=EMBED_DIM, dense_dim=FF_DIM, num_heads=1)
decoder = TransformerDecoderBlock(embed_dim=EMBED_DIM, ff_dim=FF_DIM, num_heads=2)
caption_model = ImageCaptioningModel(
    cnn_model=cnn_model,
    encoder=encoder,
    decoder=decoder,
    image_aug=image_augmentation,
)

# Learning-rate schedule: linear warmup for the first ~1/15 of all steps,
# then a constant post-warmup rate of 1e-4.
num_train_steps = len(train_dataset) * EPOCHS
num_warmup_steps = num_train_steps // 15
lr_schedule = LRSchedule(post_warmup_learning_rate=1e-4, warmup_steps=num_warmup_steps)

# BUG FIX: lr_schedule was built but never used -- the model was compiled
# with the plain string 'adam' (default learning rate). Pass the schedule
# into the Adam optimizer so the warmup actually takes effect.
caption_model.compile(optimizer=keras.optimizers.Adam(lr_schedule), loss=cross_entropy)

# Train with early stopping on the validation set.
caption_model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=valid_dataset,
    callbacks=[early_stopping],
)
# Inference helper: caption a single image given its file path.
def image_captioning(image_path):
    """Display the image at image_path and print a greedy-decoded caption."""
    # Load, preprocess and show the image.
    sample_img = decode_and_resize(image_path)
    plt.imshow(sample_img.numpy().clip(0, 255).astype(np.uint8))
    plt.show()
    # CNN features -> Transformer encoder.
    features = caption_model.cnn_model(tf.expand_dims(sample_img, 0))
    encoded_img = caption_model.encoder(features, training=False)
    # Greedy token-by-token decoding.
    # NOTE(review): seeded with "<start>" (no trailing space) while
    # generate_caption uses "<start> " -- confirm both tokenize identically.
    decoded_caption = "<start>"
    for step in range(max_decoded_sentence_length):
        tokenized = vectorization([decoded_caption])[:, :-1]
        pad_mask = tf.math.not_equal(tokenized, 0)
        predictions = caption_model.decoder(
            tokenized, encoded_img, training=False, mask=pad_mask
        )
        next_token = index_lookup[np.argmax(predictions[0, step, :])]
        if next_token == "<end>":
            break
        decoded_caption += " " + next_token
    decoded_caption = decoded_caption.replace("<start> ", "")
    decoded_caption = decoded_caption.replace(" <end>", "").strip()
    print("Predicted Caption: ", decoded_caption)
%pip install peft datasets accelerate git+https://github.com/huggingface/diffusers
%pip install argparse
%pip install pathlib
%pip install datasets
%pip install numpy
%pip install torch
%pip install transformers
%pip install accelerate
%pip install huggingface_hub
%pip install packaging
%pip install peft
%pip install tqdm
%pip install diffusers
%pip install wandb
%pip install torch
%pip install torchvision
import os
from diffusers import AutoPipelineForText2Image
import torch
os.environ['MODEL_NAME'] = 'runwayml/stable-diffusion-v1-5'
os.environ['DATASET_NAME'] = 'tomytjandra/h-and-m-fashion-caption-12k'
%accelerate launch train_text_to_image_lora.py --pretrained_model_name_or_path=$MODEL_NAME --dataset_name=$DATASET_NAME --caption_column="text" --center_crop --random_flip --resolution=128 --train_batch_size=32 --num_train_epochs=5
pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
pipeline.load_lora_weights("sd-model-finetuned-loraa", weight_name="pytorch_lora_weights.safetensors")
image = pipeline("white shirt with navy blue vertical stripes").images[0]
image.save("shirt.png")
Het pretrained model dat wij gebruiken is runwayml/stable-diffusion-v1-5, dit is een open source diffusion model van runwayml. Stable diffusion is een model dat bekend staat om goede image generatie. Daarnaast is het 1 van de bekendste bruikbare modellen en zal het dus goede documentatie hebben. Een goed model met goede documentatie en een community erachter is dus de reden waarom wij dit model hebben gekozen.
Stable diffusion werkt, zoals de naam impliceert, met diffusion; diffusion is een manier waarop images gegenereerd kunnen worden. Diffusion maakt gebruik van noise; noise is de willekeur van de pixels in een image. Als je een image hebt en je verandert elke pixel naar een random andere kleur, dan blijft er niks van je image over. Wat diffusion doet is proberen van een image met alleen maar noise te gaan naar een image die past bij hetgeen dat er geprompt is. Dit wordt gedaan door een noise predictor; deze predictor kan kleine hoeveelheden noise weghalen om zo dichter bij de uiteindelijke image te komen. Als je deze stap vaak genoeg uitvoert, krijg je een image uit je model.
In stable diffusion en diverse andere populaire diffusion modellen wordt U-Net toegepast als noise predictor. U-Net maakt gebruik van een serie van een CNN layer, gevolgd door een ReLU layer, gevolgd door een maxpooling layer. Dit gebeurt opnieuw en opnieuw. Hierbij wordt door de CNN's de image steeds gedownsampeld tot een bepaald punt, waarna de image alleen nog maar geüpsampeld wordt. Dit geeft de vorm van een U, vandaar U-Net.
Het trainen is gedaan met een script uit de github repo https://github.com/huggingface/diffusers; deze repo is van huggingface zelf en wordt gebruikt voor het genereren van images. Uit deze repo is het .py bestand train_text_to_image_lora.py gekopieerd en toegevoegd aan onze repo. Dit bestand is namelijk verantwoordelijk voor het fine-tunen van het model door middel van onze eigen data. Het script gebruikt deze file en onze dataset om het model stable-diffusion van runwayml te fine-tunen, zodat het images genereert van kledingstukken zoals wij dat willen. Het trainen van het model wordt gedaan samen met data-augmentatie: door de parameter --random_flip te gebruiken worden de afbeeldingen willekeurig geflipt, zodat er meer diverse data in de training zit. Ook worden de images naar het midden gecropt, zodat het kledingstuk altijd mooi in frame is. Dit alles zorgt voor een goede training van het model. Het model traint vervolgens voor 5 epochs met 32 afbeeldingen als batch_size.
Bij de loss van een diffusion model worden de verwachte noise en de predicted noise met elkaar vergeleken om te kijken hoe goed het model de noise in een image kan voorspellen. Aangezien een image bestaat uit een matrix van nummers, worden er dus ook een matrix van de verwachte noise en een matrix van de predicted noise vergeleken, door alle waardes uit beide matrices van elkaar af te trekken en de verschillen tot de 2e macht te berekenen. Daarna wordt het gemiddelde berekend van die waardes en dat is de enkele loss-waarde.
Dus wordt eigenlijk gewoon de MSE van de noise gepakt als loss.
Met 5 epochs bleek er een goed resultaat uit te komen, terwijl de training tijd behapbaar blijft.
import tkinter as tk
from tkinter import filedialog
from PIL import Image, ImageTk
# Callback for the "Load Image" button: let the user pick an image file,
# preview it in the window, and show placeholder text underneath it.
def on_load_image_click():
    # Ask the user for an image file; empty string means the dialog was cancelled.
    chosen_path = filedialog.askopenfilename(
        filetypes=[("Image Files", "*.png;*.jpg;*.jpeg;*.gif;*.bmp")]
    )
    if not chosen_path:
        return
    # Load and scale the image to the preview size.
    preview = Image.open(chosen_path).resize((250, 250))
    photo = ImageTk.PhotoImage(preview)
    image_label.config(image=photo)
    # Tkinter does not keep a reference to the PhotoImage; store one on the
    # label so the image is not garbage-collected.
    image_label.image = photo
    # NOTE(review): placeholder text -- presumably this was meant to call the
    # captioning model on the chosen image; confirm the intended behavior.
    text_label.config(text='image about ')
# Build the Tkinter demo window: an instruction label, a button that opens
# the file picker, a label for the image preview, and a label for the caption
# text filled in by on_load_image_click.
# Create the main window
root = tk.Tk()
root.title("Image and Text Display App")
# Create a label to display instructions
instruction_label = tk.Label(root, text="Select an image to get the prompt")
instruction_label.pack(pady=10)
# Create a button to load an image; the callback reads the labels below.
load_image_button = tk.Button(root, text="Load Image", command=on_load_image_click)
load_image_button.pack(pady=10)
# Create a label to display the image (populated by the callback)
image_label = tk.Label(root)
image_label.pack(pady=10)
# Create a label to display the text under the image (populated by the callback)
text_label = tk.Label(root, text="")
text_label.pack(pady=10)
# Start the Tkinter event loop (blocks until the window is closed)
root.mainloop()
object detectie
De uitdagingen waar we bij het yolov5 model tegen aanliepen is dat het een groot model is wat veel rekenkracht gebruikt en daardoor ook veel tijd kost om te runnen. Ondanks dat we veel hebben geprobeerd, was het helaas niet mogelijk om dit model te runnen (op mijn laptop).
Image Captioning
Een grote uitdaging is dat colab niet genoeg geheugen heeft in de gratis versie, waarbij ik niet met grotere image size voor het model kon uitvoeren.
De huggingface dataset kan wel tot tensorflow omgezet worden, maar de dataset is dan niet hetzelfde als hoe het voorbereid was bij de image captioning notebook van de les.
beeldgeneratie
Bij beeldgeneratie was het vooral lastig dat het model enorm lang duurde om te trainen, dit zorgde ervoor dat ik moeilijk kon zien of het model goed werkte. Dit zorgde ervoor dat ik meerdere keren erg lang moest wachten om vervolgens een fout op te lossen en dan weer lang te moeten wachten.
De modellen die wij hierboven hebben gebouwd kunnen bijvoorbeeld worden toegepast in kledingwinkels. Zij kunnen bij de self-check out een camera hangen waar de mensen alleen het artikel voor hoeven te houden zodat het model kan zien welk artikel het is en deze toe kan voegen aan de winkelwagen zo kunnen de mensen meteen betalen. Ook kunnen de modellen helpen bij het virtueel passen van kleding en aanbevelingen doen op basis van stijl.
De afbeeldingen en data zijn erg netjes, te netjes zelfs. In veel situaties zou je zulke nette data niet tegenkomen. Je kan dit soort afbeeldingen alleen krijgen in erg gestructureerde situaties, bijvoorbeeld als een persoon het zo neerlegt. Het zou gebruikt kunnen worden als kwaliteitscontrole in een fabriek waar kleding gemaakt wordt. Zo kan er makkelijk gefilterd worden op soorten kledingstukken en kan er efficiënter gekeken worden of de kledingstukken er goed uitzien.
We hebben geen resultaten bij objectdetectie, dus we kunnen hierop niet oordelen. Voor image captioning is het resultaat niet helemaal betrouwbaar. Het doet af en toe wel goed als het om de attributen over gaat. Voor beeldgeneratie zijn de resultaten best betrouwbaar. Bepaalde foto's zien er wel afgeknipt eruit, maar over het algemeen is het acceptabel.